{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import jieba\n",
    "import re\n",
    "import warnings\n",
    "from chinese_province_city_area_mapper.transformer import CPCATransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读取数据\n",
    "df1 = pd.read_csv('comments.csv', names=['name', 'score', 'comment', 'date', 'href'])\n",
    "df2 = pd.read_csv('cities.csv', names=['city'])\n",
    "df = pd.merge(df1, df2, left_index=True, right_index=True, how='outer') # 根据索引合并数据\n",
    "\n",
    "df.drop('href', axis=1, inplace=True) # 去掉href列\n",
    "df.drop_duplicates(subset=None, keep='first', inplace=True) # 去重（这里没有重复值）\n",
    "df.dropna(axis=0) # 删除空值 (这里没有空值)\n",
    "\n",
    "# 去掉comment的span标签\n",
    "def comment_process(comment):\n",
    "    comment = comment.strip('<span class=\"short\">').strip('</span>').replace('\\n', '').replace('\\r', '')\n",
    "    p = re.compile('[^\\u4e00-\\u9fa5]')  # 中文编码范围\\u4e00到\\u9fa5\n",
    "    comment = re.sub(p,'',comment)\n",
    "    return comment\n",
    "\n",
    "df['comment'] = df['comment'].apply(comment_process) # 使用apply比循环要快\n",
    "\n",
    "# 评分转换数字\n",
    "df['score1'] = df['score']\n",
    "df['score'] = df['score'].map({\n",
    "    '力荐': 5,\n",
    "    '推荐': 4,\n",
    "    '还行': 3,\n",
    "    '较差': 2,\n",
    "    '很差': 1\n",
    "})\n",
    "df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')  # 将datetime字段由object转换成datetime类型，速度回快很多\n",
    "\n",
    "# 处理城市数据，如'讷河, 齐齐哈尔'提取为齐齐哈尔，'江苏南京'提取为南京，同时去除国外城市\n",
    "def city_process(line):\n",
    "    city = re.compile('[^\\u4e00-\\u9fa5]') # 中文编码范围\\u4e00到\\u9fa5\n",
    "    # 取出中文字符，返回列表\n",
    "    zh = re.split(city, line)\n",
    "    # 取列表中最后一个，例如'讷河, 齐齐哈尔'取齐齐哈尔\n",
    "    zh = zh[-1]\n",
    "    return zh\n",
    "\n",
    "df['city'] = df['city'].apply(city_process)\n",
    "# 提取出city中的市\n",
    "cpca = CPCATransformer()\n",
    "df['city'] = cpca.transform(df.city)['市']\n",
    "\n",
    "# df1 = df[df['city'] != ''] # 去除城市为空数据\n",
    "df.replace('北京市', '北京', inplace=True)\n",
    "df.replace('上海市', '上海', inplace=True)\n",
    "df.to_csv('data.csv', index=0, encoding='utf-8-sig')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "comment_cut = ''\n",
    "comments = df['comment'].tolist()\n",
    "\n",
    "for comment in comments:\n",
    "    comment = jieba.cut(comment)\n",
    "    comment = ' '.join(comment)\n",
    "    comment_cut += comment\n",
    "    \n",
    "df_comment = pd.DataFrame([{\n",
    "    'index' : '',\n",
    "    'comment' : ''\n",
    "}])\n",
    "comments = comment_cut.split(' ')\n",
    "i = 1\n",
    "for comment in comments:\n",
    "    insertRow = pd.DataFrame([{\n",
    "    'index' : str(i),\n",
    "    'comment' : comment\n",
    "    }])\n",
    "    df_comment = pd.concat([df_comment, insertRow], ignore_index=True)\n",
    "    i += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_comment.drop([0], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "count = df_comment['comment'].value_counts()\n",
    "\n",
    "value = count.index.tolist()\n",
    "count = count[value].tolist()\n",
    "df_count = pd.DataFrame({\n",
    "    'value' : value,\n",
    "    'count' : count\n",
    "})\n",
    "\n",
    "def value_len(value):\n",
    "    return len(value)\n",
    "\n",
    "df_count['len'] = df_count['value'].apply(value_len)\n",
    "df_count = df_count[df_count['len'] > 1]\n",
    "df_count = df_count.iloc[:30]\n",
    "\n",
    "df_count.to_csv('count.csv', encoding='utf-8-sig')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
